Exploratory analysis of variables

# Load the cancer registry CSV and derive the variables used below.
# Steps, in order:
#   1. read the file and convert the column names to snake_case;
#   2. split `geography` into `county` and `state`;
#   3. store both as factors and add the derived columns;
#   4. keep only rows whose median_age is below 100;
#   5. move the response and the derived case rate to the front.
# (The earlier intermediate `select(target_death_rate, everything())` was
# dropped: the final `select()` fixes the column order on its own.)
cancer_df <- read_csv("./data/Cancer_Registry.csv") %>%
  janitor::clean_names() %>%
  # Split on the comma plus any whitespace after it. Assumes values look
  # like "<county>, <state>"; with a bare "," separator the state label
  # would keep a leading blank.
  separate(geography, c("county", "state"), sep = ",\\s*") %>%
  mutate(
    county = as.factor(county),
    state = as.factor(state),
    # Despite the `pct_` prefix this is a rate per 100,000: the average
    # annual count divided by the 2015 population estimate, times 100000.
    pct_case_count = avg_ann_count / pop_est2015 * 100000,
    pct_hs = pct_no_hs18_24 + pct_hs18_24,
    # NOTE(review): this adds two percentages that appear to refer to
    # different age groups (18-24 and 25+) -- confirm this sum is intended.
    pct_bach_deg = pct_bach_deg18_24 + pct_bach_deg25_over
  ) %>%
  # Rows with median_age of 100 or more (or missing) are removed here.
  filter(median_age < 100) %>%
  select(target_death_rate, pct_case_count, everything())
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   avgDeathsPerYear = col_integer(),
##   medIncome = col_integer(),
##   popEst2015 = col_integer(),
##   binnedInc = col_character(),
##   Geography = col_character()
## )
## See spec(...) for full column specifications.

Plot of annual diagnosed case count per 100,000 population

# Scatter plot of the death rate against the annual case count per 100,000,
# coloured by state. The response (target_death_rate) is placed on the y
# axis so this plot reads the same way as the other scatter plots in this
# analysis, which all put target_death_rate on y.
plot_count_pct <-
  cancer_df %>%
  ggplot(aes(x = pct_case_count, y = target_death_rate, color = state)) +
  geom_point()
# A smoother is intentionally left disabled: geom_smooth(se = FALSE)

# Render the static ggplot as an interactive plotly widget.
ggplotly(plot_count_pct)

Incidence rate plot

# Death rate versus incidence rate, one point per row of cancer_df,
# coloured by state. No smoother is drawn (geom_smooth stays disabled).
plot_incidence <- ggplot(
  data = cancer_df,
  mapping = aes(x = incidence_rate, y = target_death_rate, color = state)
) +
  geom_point()

# Separate statement: convert the ggplot object to an interactive widget.
ggplotly(plot_incidence)
# Influential points in the dataset are in the states of Florida and Virginia.

Income plot

# Death rate versus median income, one point per row of cancer_df,
# coloured by state. No smoother is drawn (geom_smooth stays disabled).
plot_income <- ggplot(
  data = cancer_df,
  mapping = aes(x = med_income, y = target_death_rate, color = state)
) +
  geom_point()

# Separate statement: convert the ggplot object to an interactive widget.
ggplotly(plot_income)

Age plots

# Death rate versus overall median age, with a smoothed trend line.
plot_age_1 <-
  cancer_df %>%
  ggplot(aes(x = median_age, y = target_death_rate)) +
  geom_point() +
  # Spell out FALSE: `F` is an ordinary variable that can be reassigned.
  # se = FALSE hides the confidence band around the smoother.
  geom_smooth(se = FALSE)

# Render the static ggplot as an interactive plotly widget.
ggplotly(plot_age_1)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Erroneous data in this column: some median_age values are larger than 100.

# Histogram of median_age, restricted to values below 100.
# Uses geom_histogram()'s default binning.
ggplot(
  data = filter(cancer_df, median_age < 100),
  mapping = aes(x = median_age)
) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Death rate versus median male age, with a smoothed trend line.
plot_age_2 <-
  cancer_df %>%
  ggplot(aes(x = median_age_male, y = target_death_rate)) +
  geom_point() +
  # Spell out FALSE: `F` is an ordinary variable that can be reassigned.
  # se = FALSE hides the confidence band around the smoother.
  geom_smooth(se = FALSE)

# Render the static ggplot as an interactive plotly widget.
ggplotly(plot_age_2)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Death rate versus median female age, with a smoothed trend line.
plot_age_3 <-
  cancer_df %>%
  ggplot(aes(x = median_age_female, y = target_death_rate)) +
  geom_point() +
  # Spell out FALSE: `F` is an ordinary variable that can be reassigned.
  # se = FALSE hides the confidence band around the smoother.
  geom_smooth(se = FALSE)

# Render the static ggplot as an interactive plotly widget.
ggplotly(plot_age_3)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Pairwise correlation matrix of the columns left after dropping county,
# state and binned_inc, shown as a tibble.
# Rows of the result follow the same variable order as its columns.
# NOTE(review): cor() is called with its default handling of missing
# values; if any remaining column contains NA, its correlations will be
# NA -- confirm whether a `use =` argument is wanted.
cancer_df %>%
  select(-county, -state, -binned_inc) %>%
  cor() %>%
  # as_tibble() replaces the deprecated as.tibble() alias.
  as_tibble()
## # A tibble: 35 x 35
##    target_death_ra… pct_case_count avg_ann_count avg_deaths_per_…
##               <dbl>          <dbl>         <dbl>            <dbl>
##  1          1             -0.0551        -0.143           -0.0904
##  2         -0.0551         1              0.161           -0.0589
##  3         -0.143          0.161          1                0.940 
##  4         -0.0904        -0.0589         0.940            1     
##  5          0.448          0.0230         0.0742           0.0631
##  6         -0.428          0.0278         0.269            0.223 
##  7         -0.119         -0.0518         0.927            0.978 
##  8          0.429         -0.123         -0.135           -0.0667
##  9         -0.0225        -0.00419        0.0819           0.0633
## 10         -0.00429        0.124         -0.122           -0.145 
## # ... with 25 more rows, and 31 more variables: incidence_rate <dbl>,
## #   med_income <dbl>, pop_est2015 <dbl>, poverty_percent <dbl>,
## #   study_per_cap <dbl>, median_age <dbl>, median_age_male <dbl>,
## #   median_age_female <dbl>, avg_household_size <dbl>,
## #   percent_married <dbl>, pct_no_hs18_24 <dbl>, pct_hs18_24 <dbl>,
## #   pct_some_col18_24 <dbl>, pct_bach_deg18_24 <dbl>, pct_hs25_over <dbl>,
## #   pct_bach_deg25_over <dbl>, pct_employed16_over <dbl>,
## #   pct_unemployed16_over <dbl>, pct_private_coverage <dbl>,
## #   pct_private_coverage_alone <dbl>, pct_emp_priv_coverage <dbl>,
## #   pct_public_coverage <dbl>, pct_public_coverage_alone <dbl>,
## #   pct_white <dbl>, pct_black <dbl>, pct_asian <dbl>,
## #   pct_other_race <dbl>, pct_married_households <dbl>, birth_rate <dbl>,
## #   pct_hs <dbl>, pct_bach_deg <dbl>
# Fit an ordinary least squares model for target_death_rate and print its
# summary. Predictors: incidence_rate, pct_public_coverage_alone, and two
# interactions with their main effects (med_income with
# pct_bach_deg25_over, and pct_unemployed16_over with poverty_percent).
# The formula stays inline so the "Call:" section of the summary shows it.
summary(
  lm(
    target_death_rate ~
      incidence_rate +
      med_income * pct_bach_deg25_over +
      pct_unemployed16_over * poverty_percent +
      pct_public_coverage_alone,
    data = cancer_df
  )
)
## 
## Call:
## lm(formula = target_death_rate ~ incidence_rate + med_income * 
##     pct_bach_deg25_over + pct_unemployed16_over * poverty_percent + 
##     pct_public_coverage_alone, data = cancer_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -114.865  -11.770    0.047   11.648  137.687 
## 
## Coefficients:
##                                         Estimate Std. Error t value
## (Intercept)                            1.035e+02  8.133e+00  12.730
## incidence_rate                         2.161e-01  6.775e-03  31.899
## med_income                            -4.662e-04  1.205e-04  -3.870
## pct_bach_deg25_over                   -2.719e+00  2.479e-01 -10.967
## pct_unemployed16_over                  1.217e+00  2.868e-01   4.244
## poverty_percent                        7.786e-01  1.841e-01   4.229
## pct_public_coverage_alone              1.924e-01  1.145e-01   1.680
## med_income:pct_bach_deg25_over         2.148e-05  4.502e-06   4.771
## pct_unemployed16_over:poverty_percent -3.050e-02  1.229e-02  -2.481
##                                       Pr(>|t|)    
## (Intercept)                            < 2e-16 ***
## incidence_rate                         < 2e-16 ***
## med_income                            0.000111 ***
## pct_bach_deg25_over                    < 2e-16 ***
## pct_unemployed16_over                 2.26e-05 ***
## poverty_percent                       2.41e-05 ***
## pct_public_coverage_alone             0.093006 .  
## med_income:pct_bach_deg25_over        1.92e-06 ***
## pct_unemployed16_over:poverty_percent 0.013149 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.13 on 3008 degrees of freedom
## Multiple R-squared:  0.4747, Adjusted R-squared:  0.4733 
## F-statistic: 339.8 on 8 and 3008 DF,  p-value: < 2.2e-16